This is my notebook that I will used to go through the tidy text notebook by Julia Silge and David Robinson.
library(tidyverse)
library(tidytext)
library(stringr)
library(janeaustenr)
text <- c("Because I could not stop for Death - ",
"He kindly stopped for me -",
"The Carriage held but just Ourselves -",
"and Immortality")
text
## [1] "Because I could not stop for Death - "
## [2] "He kindly stopped for me -"
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
text_df <- data_frame(line = 1:4, text = text)
text_df
text_df %>%
unnest_tokens(word, text) # to_lower = FALSE to keep uppercase
Using the Jane Austen dataset we can make it tidy. First we’ll use mutate to create a column from existing data. Annotate line numbers and then keep track of chapters using regex.
original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore.case = TRUE)))) %>%
ungroup()
original_books
Restructure it in one-token-per-row format with the unnest_tokens() function.
tidy_books <- original_books %>%
unnest_tokens(word, text)
tidy_books
We can remove stop words
data(stop_words)
tidy_books <- tidy_books %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_books %>%
count(word, sort = TRUE)
Plotting data
tidy_books %>%
count(word, sort = TRUE) %>%
filter(n > 600) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n))+
geom_col()+
xlab(NULL)+
coord_flip()+
theme_bw()
#### Gutenbergr
HG wells books
library(gutenbergr)
hgwells <- gutenberg_download(c(35, 36, 5230, 159))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
tidy_hgwells <- hgwells %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_hgwells %>%
count(word, sort = TRUE)
Most commone words in novels by Bronte sisters
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))
tidy_bronte <- bronte %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_bronte %>%
count(word, sort = TRUE)
Comparing Bronte and HG Wells
frequency <- bind_rows(mutate(tidy_bronte, author = "Bronte Sisters"),
mutate(tidy_hgwells, author = "H.G. Wells"),
mutate(tidy_books, author = "Jane Austen")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(author, word) %>%
group_by(author) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(author, proportion) %>%
gather(author, proportion, `Bronte Sisters`:`H.G. Wells`)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
ggplot(frequency, aes(x = proportion, y = `Jane Austen`,
color = abs(`Jane Austen` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word, check_overlap = TRUE, vjust = 1.5))+
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0,0.001),
low = "darkslategray4", high = "gray75") +
facet_wrap(~author, ncol = 2) +
theme(legend.position = "none") +
labs(y = "Jane Austen", x = NULL)
## Warning: Ignoring unknown aesthetics: check_overlap
## Warning: Removed 41357 rows containing missing values (geom_point).
## Warning: Removed 41359 rows containing missing values (geom_text).
cor.test(data = frequency[frequency$author == "Bronte Sisters",],
~ proportion + `Jane Austen`)
##
## Pearson's product-moment correlation
##
## data: proportion and Jane Austen
## t = 119.65, df = 10404, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7527846 0.7689620
## sample estimates:
## cor
## 0.7609915
cor.test(data = frequency[frequency$author == "H.G. Wells",],
~ proportion + `Jane Austen`)
##
## Pearson's product-moment correlation
##
## data: proportion and Jane Austen
## t = 36.441, df = 6053, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4032800 0.4445987
## sample estimates:
## cor
## 0.4241601
library(tidytext)
sentiments
Sentiment scores
# AFINN
get_sentiments("afinn")
# Bing
get_sentiments("bing")
# NRC
get_sentiments("nrc")
Sentiment analysis can be performed with an inner join
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
tidy_books
Get a list of words that are associated with ‘joy’ and perform an inner_join to find words that are have ‘joy’ sentiment within Jane Austen’s Emma book.
# Filter words associated with 'joy'
nrcjoy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
# Find words associated with joy in Emma
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrcjoy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
Count up positive or negative words that are within sections of each book. We will define an index to keep track of where we are in the narrative. The index will count up sections of 80 lines of text
janeaustensentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill =0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
janeaustensentiment
ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(score)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
afinn
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bing_and_nrc
Comparing three sentiment lexicons using Pride and Prejudice
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1 , scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative")) %>%
count(sentiment)
get_sentiments("bing") %>%
count(sentiment)
bing_words_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_words_counts
bing_words_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE)+
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL,
title = "Top 10 words by Sentiment for Pride and Prejudice") +
coord_flip()
## Selecting by n
Miss is referred to as a negative word but is not in Jane Austen’s works.
custom_stop_words <- bind_rows(data_frame(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word~sentiment, value.car = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
## Using n as value column: use value.var to override.
Looking at Units Beyond Just Words
PandP_sentences <- data_frame(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
Figuring out how many chapters are in a book
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text , token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter !=0) %>%
top_n(1) %>%
ungroup()
## Joining, by = "word"
## Selecting by ratio
# Chapter 3
Term Frequency in Jane Austen’s Novels
book_words <- austen_books() %>%
unnest_tokens(word,text) %>%
count(book,word,sort = TRUE) %>%
ungroup()
total_words <- book_words %>%
group_by(book) %>%
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
Term frequency distribution in Jane Austen’s novels
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (stat_bin).
Zipf’s Law
Zipf’s Law: frequency that a word appears is inversely proportional to its rank.
freq_by_rank <- book_words %>%
group_by(book) %>%
mutate(rank = row_number(),
'term frequency' = n/total)
freq_by_rank
Graph of Zipf’s law for Jane Austen’s novels
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
scale_x_log10() +
scale_y_log10()
rank_subset <- freq_by_rank %>%
filter(rank <500,
rank >10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.6226 -1.1125
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book))+
geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
The bind_tf_idf Function
book_words <- book_words %>%
bind_tf_idf(word, book, n)
book_words
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2,scales = "free") +
coord_flip()
## Selecting by tf_idf
A corpus of Physics Text
physics <- gutenberg_download(c(37729, 14725, 13476, 5001),
meta_fields = "author")
physics_words <- physics %>%
unnest_tokens(word, text) %>%
count(author, word, sort = TRUE) %>%
ungroup()
physics_words
author_order <- c("Galilei, Galileo", "Huygens, Christiaan", "Tesla, Nikola", "Einstein, Albert")
plot_physics <- physics_words %>%
bind_tf_idf(word, author, n) %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
mutate(author = factor(author, levels = author_order))
plot_physics
plot_physics %>%
group_by(author) %>%
top_n(15, tf_idf) %>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, ncol = 2, scales = "free") +
coord_flip()
Isolating “eq” from Einstein’s text
physics %>%
filter(str_detect(text, "eq\\.")) %>%
select(text)
K1 was used for the coordinate system for Einstein
physics %>%
filter(str_detect(text, "K1")) %>%
select(text)
physics %>%
filter(str_detect(text, "AK")) %>%
select(text)
mystopwords <- data_frame(word = c("eq","co","rc","ac","ak","bn",
"fig", "file", "cg", "cb", "cm"))
physics_words <- anti_join(physics_words, mystopwords, by = "word")
plot_physics <- physics_words %>%
bind_tf_idf(word, author, n) %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(author) %>%
top_n(15, tf_idf) %>%
ungroup() %>%
mutate(author = factor(author, levels = author_order))
plot_physics
ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs( x = NULL, y = "tf-idf") +
facet_wrap(~author, ncol = 2, scales = "free") +
coord_flip()
austen_bigrams <- austen_books() %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
austen_bigrams
Counting and Filtering N-grams
austen_bigrams %>%
count(bigram, sort = TRUE)
bigrams_separated <- austen_bigrams %>%
separate(bigram, into = c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigrams_united
austen_books() %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
Identifying the “streets” in each book.
bigrams_filtered %>%
filter(word2 == "street") %>%
count(book, word1, sort = TRUE)
bigram_tf_idf <- bigrams_united %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
The 12 bigrams with the highest tf-idf from each Jane Austen novel.
bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
group_by(book) %>%
top_n(12,tf_idf) %>%
ungroup() %>%
ggplot(aes(x = bigram, y = tf_idf, fill = book))+
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free")+
coord_flip()
Using Bigrams to Provide Context in Sentiment Analysis
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
AFINN <- get_sentiments("afinn")
AFINN
not_words <- bigrams_separated %>%
rename(word = word2) %>%
filter(word1 == "not") %>%
inner_join(AFINN) %>%
count(word, score, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
not_words